import os
import json
import torch
from openai import AzureOpenAI
from tqdm import tqdm
import sys
import re
import base64
import cv2
from PIL import Image
from io import BytesIO
import random
import pandas as pd
import numpy as np
import argparse
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    AutoProcessor,
    AutoModelForVision2Seq,
)
from transformers import MllamaForConditionalGeneration

# Command-line interface for one evaluation job.
# NOTE(review): --count_personas is parsed here but is not read anywhere in
# the visible script — confirm whether it is still needed.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--count_personas',
    action='store_true',
    help='Output the number of personas',
)
parser.add_argument(
    '--start',
    type=int,
    default=0,
    help='Start index for dataset slicing',
)
parser.add_argument(
    '--end',
    type=int,
    default=None,
    help='End index for dataset slicing (inclusive)',
)
parser.add_argument(
    '--output_dir',
    type=str,
    default='results',
    help='Directory to save per-job JSON outputs',
)
parser.add_argument(
    '--runs_list',
    type=str,
    default='20,40,60,80,100',
    help='Comma-separated list indicating how many times to repeat each persona prediction (e.g., "20,40,60")',
)
args = parser.parse_args()

# Resolve the output directory to an absolute path once, store it back on
# `args` so both spellings agree, and make sure it exists.
args.output_dir = os.path.abspath(args.output_dir)
output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)

# Persona prompt table: maps a persona key ("<age bracket>_<gender>") to the
# full prompt text for that persona. The main loop iterates this dict; for GPT
# the text is passed to create_persona_system_prompt(), for local models it is
# prepended verbatim to the user prompt. Every prompt asks for a "Reason:" line
# and an "Answer:" line; the main loop extracts the score from "Answer:".
persona_prompts = {
    "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

Return:
Reason: [Why this website does or doesn't appeal to you visually and emotionally]
Answer: [0–10] ← You must include this score.""",

    "18-24_male": """You are a man aged 18–24. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast: bold layouts, smart design, or a bit of edge. If a website feels outdated, cluttered, or boring, it loses your interest quickly.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

Return:
Reason: [Your opinion on whether the website looks cool, clean, boring, or chaotic]
Answer: [0–10] ← You must include this score.""",

    "25-34_female": """You are a woman aged 25–34. You appreciate modern, polished websites that feel aligned with your lifestyle—whether it's wellness, creativity, relationships, or career. You like clean layouts, elegant color palettes, and visuals that are both pretty and purposeful.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

Return:
Reason: [Explain what makes it feel appealing, elegant, or uninviting]
Answer: [0–10] ← You must include this score.""",

    "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

Return:
Reason: [Why this design works for you—or not]
Answer: [0–10] ← You must include this score.""",

    "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

Return:
Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic]
Answer: [0–10] ← You must include this score.""",

    "35-44_male": """You are a man aged 35–44. You like websites that are grounded, practical, and cleanly designed. Strong layouts, good use of space, and purpose-driven content grab your attention more than visual noise.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

Return:
Reason: [Explain what makes this site feel appealing or forgettable]
Answer: [0–10] ← You must include this score.""",

    "45-54_female": """You are a woman aged 45–54. You like websites that are calm, clear, and visually composed. Design that feels warm, thoughtful, and emotionally grounded appeals more than flashy visuals or trendy noise.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on clarity, emotional tone, and visual presentation.

Return:
Reason: [Why this website would—or wouldn't—feel pleasant and worth staying on]
Answer: [0–10] ← You must include this score.""",

    "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

Return:
Reason: [What stood out to you—positively or negatively—in its design or layout]
Answer: [0–10] ← You must include this score.""",

    "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

Return:
Reason: [How the design made you feel—comforted, confused, interested, or indifferent]
Answer: [0–10] ← You must include this score.""",

    "55+_male": """You are a man aged 55 or older. You value websites that are straightforward, honest, and easy to engage with. Flashy or cluttered pages can feel frustrating, while clear structure and meaningful content feel worthwhile.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on clarity, usefulness, and visual comfort.

Return:
Reason: [What you liked—or disliked—about the way this site is designed]
Answer: [0–10] ← You must include this score."""
}

# persona_prompts = {
#     "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return:
# Reason: [Why this website does or doesn't appeal to you visually and emotionally]
# Answer: [0–10] ← You must include this score.""",

#     "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return:
# Reason: [Why this design works for you—or not]
# Answer: [0–10] ← You must include this score.""",

#     "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return:
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic]
# Answer: [0–10] ← You must include this score.""",

#     "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

# Return:
# Reason: [What stood out to you—positively or negatively—in its design or layout]
# Answer: [0–10] ← You must include this score.""",

#     "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

# Return:
# Reason: [How the design made you feel—comforted, confused, interested, or indifferent]
# Answer: [0–10] ← You must include this score.""",
# }

# persona_prompts = {
#     "18-24_female_v1": """You are a woman aged 18–24 with an annual income over $100K. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

#     "18-24_female_v2": """You are a woman aged 18–24 with an annual income under $30,000. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

#     "18-24_male_v1": """You are a man aged 18–24 with an annual income over $100K. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast and look slick or high-end.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Your opinion on whether the website looks cool, clean, boring, or chaotic in minimal words]""",

#     "18-24_male_v2": """You are a man aged 18–24 with an annual income under $30,000. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast and look slick or high-end.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Your opinion on whether the website looks cool, clean, boring, or chaotic in minimal words]""",

#     "25-34_female_v1": """You are a woman aged 25–34 with an annual income over $100K. You appreciate polished, aspirational websites that match your lifestyle—wellness, creativity, relationships, career. You value clarity, taste, and brand maturity.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes it feel appealing, elegant, or uninviting in minimal words]""",

#     "25-34_female_v2": """You are a woman aged 25–34 with an annual income under $30,000. You appreciate polished, aspirational websites that match your lifestyle—wellness, creativity, relationships, career. You value clarity, taste, and brand maturity.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes it feel appealing, elegant, or uninviting in minimal words]""",

#     "25-34_male_v1": """You are a man aged 25–34 with an annual income over $100K. You value websites that feel sharp, modern, and confident. Bold layouts, strong CTAs, high-quality visuals—especially in tech, money, or fitness—stand out to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this design works for you—or not in minimal words]""",

#     "25-34_male_v2": """You are a man aged 25–34 with an annual income under $30,000. You value websites that feel sharp, modern, and confident. Bold layouts, strong CTAs, high-quality visuals—especially in tech, money, or fitness—stand out to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Why this design works for you—or not in minimal words]""",

#     "35-44_female_v1": """You are a woman aged 35–44 with an annual income over $100K. You appreciate websites that are refined, emotionally intelligent, and thoughtfully curated. You value elegance and simplicity, especially when paired with a sense of emotional resonance and purpose.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic in minimal words]""",

#     "35-44_female_v2": """You are a woman aged 35–44 with an annual income under $30,000. You appreciate websites that are refined, emotionally intelligent, and thoughtfully curated. You value elegance and simplicity, especially when paired with a sense of emotional resonance and purpose.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic in minimal words]""",

#     "35-44_male_v1": """You are a man aged 35–44 with an annual income over $100K. You expect websites to be efficient, modern, and professionally designed. Visual clarity, smart layout, and purposeful messaging signal value to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes this site feel appealing or forgettable in minimal words]""",

#     "35-44_male_v2": """You are a man aged 35–44 with an annual income under $30,000. You expect websites to be efficient, modern, and professionally designed. Visual clarity, smart layout, and purposeful messaging signal value to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

# Return your response in this exact format:
# Answer: [0–10]
# Reason: [Explain what makes this site feel appealing or forgettable in minimal words]""",
# }




def frame_to_data_url(frame_bgr):
    """Encode an OpenCV frame as a base64 JPEG data URL.

    The frame is converted from BGR to RGB, resized to 256x256 with Lanczos
    resampling, JPEG-encoded in memory and wrapped in a ``data:`` URL.

    Args:
        frame_bgr: Image array in OpenCV's BGR channel order.

    Returns:
        A string of the form ``data:image/jpeg;base64,<payload>``.

    Raises:
        ValueError: If ``frame_bgr`` is None, which is what ``cv2.imread``
            returns when a file is missing or unreadable.
    """
    if frame_bgr is None:
        # Fail with an actionable message instead of an opaque OpenCV
        # assertion raised from inside cvtColor.
        raise ValueError(
            "frame_to_data_url received None instead of an image "
            "(cv2.imread returns None when the file cannot be read)"
        )

    # OpenCV stores channels as BGR; PIL expects RGB.
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(frame_rgb).resize((256, 256), Image.LANCZOS)

    # JPEG-encode into memory and read the bytes back without a seek/read pair.
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    base64_encoded_data = base64.b64encode(buffered.getvalue()).decode('utf-8')

    return f"data:image/jpeg;base64,{base64_encoded_data}"

# -----------------------------
# Azure OpenAI Configuration (GPT models)
# -----------------------------
api_version = "2024-02-15-preview"
# Credentials are taken from the environment when available so that real
# secrets never need to be committed; the placeholders remain as fallbacks.
config_dict = {
    'api_key': os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_OPENAI_API_KEY"),
    'api_version': api_version,
    'azure_endpoint': os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://your-azure-openai-endpoint/"
    ),
}

# -----------------------------
# Local LLMs (Qwen & Llama) setup
# -----------------------------

# Cache to avoid re-loading models multiple times
_LOCAL_MODEL_CACHE = {}


def _load_local_model(model_key: str):
    """Lazy-load and cache a local Hugging Face vision model.

    Args:
        model_key: Either "qwen" or "llama".

    Returns:
        Tuple ``(model, processor)``. The same tuple object is returned on
        every later call with the same key.

    Raises:
        ValueError: If ``model_key`` is not one of the supported keys.
    """
    if model_key in _LOCAL_MODEL_CACHE:
        return _LOCAL_MODEL_CACHE[model_key]

    # Only the checkpoint name, log line and model class differ per key;
    # the loading options below are shared.
    if model_key == "qwen":
        model_name = "Qwen/Qwen2.5-VL-72B-Instruct"
        banner = "[INFO] Loading Qwen2.5-VL-72B-Instruct vision model …"
        model_cls = AutoModelForVision2Seq
    elif model_key == "llama":
        model_name = "meta-llama/Llama-3.2-90B-Vision"
        banner = "[INFO] Loading meta-llama/Llama-3.2-90B-Vision …"
        model_cls = MllamaForConditionalGeneration
    else:
        raise ValueError(f"Unknown model_key {model_key}")

    # Imported here so the block stays self-contained; passing a bare
    # `load_in_4bit=True` kwarg to from_pretrained is deprecated in favour of
    # an explicit quantization config.
    from transformers import BitsAndBytesConfig

    print(banner)
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    model = model_cls.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        trust_remote_code=True,
    )

    _LOCAL_MODEL_CACHE[model_key] = (model, processor)
    return _LOCAL_MODEL_CACHE[model_key]


def _cv2_to_pil(img_bgr):
    """Return a PIL image built from an OpenCV BGR array (channels swapped to RGB)."""
    return Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))


def verbalize_local(model_key: str, full_prompt: str, target_cv2_img) -> str:
    """Generate a completion with a local vision LLM.

    Args:
        model_key: "qwen" or "llama".
        full_prompt: Text prompt including persona and user instructions.
        target_cv2_img: BGR image loaded via cv2 (target website screenshot).

    Returns:
        The newly generated text only (prompt tokens and special tokens are
        removed), stripped of surrounding whitespace.
    """
    model, processor = _load_local_model(model_key)
    target_pil = _cv2_to_pil(target_cv2_img)

    if model_key == "qwen":
        # Chat-style message with an image placeholder; the actual pixels are
        # supplied to the processor together with the rendered template.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": full_prompt},
                ],
            }
        ]
        chat_text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = processor(
            text=[chat_text],
            images=[target_pil],
            return_tensors="pt",
        ).to(model.device)
    else:  # llama
        prompt = "<|image|><|begin_of_text|>" + full_prompt
        inputs = processor(target_pil, prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=350, temperature=0.85, do_sample=True)

    # generate() returns prompt + continuation; keep only the continuation so
    # the caller never parses text echoed from the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    response = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()


# -----------------------------
# LLM assignment per-persona
# -----------------------------

# Cycle through GPT → Qwen → Llama for each persona key as requested.
# Round-robin assignment: personas are taken in dict order and mapped to
# "gpt", "qwen", "llama", "gpt", ... in turn.
_persona_keys = [*persona_prompts]
_llm_cycle = ["gpt", "qwen", "llama"]
PERSONA_TO_LLM = dict(
    (key, _llm_cycle[idx % len(_llm_cycle)])
    for idx, key in enumerate(_persona_keys)
)

# --------------------------------------------------------------
# Pre-load open-source vision models before any inference starts
# --------------------------------------------------------------

# Warm the model cache up front so the first inference call does not pay the
# loading cost.
print("[INFO] Pre-loading local vision LLMs (Qwen & Llama)…  This may take a few minutes.")
for _local_key in ("qwen", "llama"):
    _load_local_model(_local_key)
print("[INFO] Local models loaded. Starting evaluation.")

# --------------------------------------------------------------------------------

def create_persona_system_prompt(persona_specification):
    """Build the system prompt for one persona.

    The prompt opens with "You are <first sentence of the specification>",
    repeats the full specification, and then states the rating task and the
    required "Reason:/Answer:" output format.

    Args:
        persona_specification: Free-text persona description. Its first
            sentence (text up to the first '.') is used as the short intro.

    Returns:
        The assembled system prompt string.
    """
    # Extract the first sentence as the short description
    first_sentence = persona_specification.split('.')[0] + '.'

    # Specifications that already begin with "You are" would otherwise yield
    # "You are You are ..." in the opening line.
    duplicated_lead = "You are "
    if first_sentence.startswith(duplicated_lead):
        first_sentence = first_sentence[len(duplicated_lead):]

    # Active system prompt.
    return f"""You are {first_sentence}
    
    {persona_specification}
    
    You are evaluating website aesthetics and design quality. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content, considering your unique background, personality, and preferences.
    
    You can provide precise scores including decimal values (e.g., 7.5, 8.2) to better reflect your nuanced judgment.
    
    You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot.
    
    Return:
    Reason: [Explain your reaction based on your background and preferences]
    Answer: [0–10] ← You must include this score."""
    
    # ALTERNATIVE (currently disabled): enhanced system prompt with chain-of-thought and structured evaluation
#     return f"""You are {first_sentence}

# {persona_specification}

# You are an expert website aesthetic evaluator. You will be shown 5 example websites with their likeability scores (0-10 scale), followed by a target website to evaluate.

# EVALUATION METHODOLOGY:
# Use this step-by-step chain-of-thought approach to systematically evaluate the target website:

# STEP 1 - FIRST IMPRESSION ANALYSIS:
# - What is your immediate emotional reaction to this website?
# - Does it feel modern, professional, outdated, cluttered, or clean?
# - How does it compare to current web design trends?

# STEP 2 - TECHNICAL DESIGN ASSESSMENT:
# - Visual Hierarchy: How well does the layout guide your eye? (Rate 1-10)
# - Color Harmony: How pleasing and cohesive is the color scheme? (Rate 1-10)
# - Typography: How readable and aesthetically pleasing are the fonts? (Rate 1-10)
# - Layout Balance: How well-balanced and organized is the content? (Rate 1-10)
# - Visual Appeal: How attractive is the overall design? (Rate 1-10)

# STEP 3 - CONTEXTUAL COMPARISON:
# - Compare this website to the 5 examples you were shown
# - Which example website is it most similar to in quality?
# - Is it better or worse than that example, and by how much?

# STEP 4 - PERSONAL PREFERENCE INTEGRATION:
# - Based on your background and expertise described above, how does this align with your aesthetic preferences?
# - What specific elements appeal to or displease you personally?

# STEP 5 - FINAL SYNTHESIS:
# - Average your technical scores from Step 2
# - Adjust based on your personal preferences (+/- 1-2 points)
# - Consider the comparative context from Step 3
# - Provide your final score with decimal precision (e.g., 7.3, 8.7)

# You can provide precise scores including decimal values to reflect nuanced judgment.

# REQUIRED OUTPUT FORMAT:
# First Impression: [Your immediate reaction]
# Technical Analysis: [Brief analysis with sub-scores for each dimension]
# Comparison: [How it compares to the examples]
# Personal Perspective: [Your unique viewpoint based on your background]
# Final Reasoning: [Synthesis of all factors]
# Answer: [Your final score 0-10 with decimals] ← You must include this numerical score."""

def get_json_data_generate(sys_prompt, user_prompt, images):
    """Assemble the chat payload (system + user message) for a vision request.

    Args:
        sys_prompt: Text of the system message.
        user_prompt: Text placed first in the user message content.
        images: List of ``(data_url, score)`` tuples. Every entry except the
            last is an example image; the last entry is the image to score.

    Returns:
        ``{"messages": [system_message, user_message]}`` where the user
        content is the text part followed by one image part per entry.
    """

    def _example_part(url, score):
        # Example screenshots: low detail, score attached with 2 decimals.
        return {
            "type": "image_url",
            "image_url": {"url": url, "detail": "low"},
            "score": f"{score:.2f}"
        }

    def _target_part(url):
        # The screenshot being rated: high detail, no score.
        return {
            "type": "image_url",
            "image_url": {"url": url, "detail": "high"}
        }

    last_position = len(images) - 1
    image_parts = [
        _target_part(url) if position == last_position else _example_part(url, score)
        for position, (url, score) in enumerate(images)
    ]

    system_message = {"role": "system", "content": sys_prompt}
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}, *image_parts],
    }
    return {"messages": [system_message, user_message]}

def verbalize(prompt, sys_prompt, images):
    """Send one vision chat request to Azure OpenAI and return the reply text.

    Args:
        prompt: User prompt text.
        sys_prompt: System prompt text.
        images: List of ``(data_url, score)`` tuples forwarded to
            ``get_json_data_generate``.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    payload = get_json_data_generate(sys_prompt, prompt, images)

    azure_client = AzureOpenAI(
        azure_endpoint=config_dict['azure_endpoint'],
        api_version=config_dict['api_version'],
        api_key=config_dict['api_key'],
    )

    request_options = dict(
        model='gpt-4o',
        messages=payload["messages"],
        max_tokens=350,
        temperature=0.85,
        n=1,
    )
    completion = azure_client.chat.completions.create(**request_options)

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Parse --runs_list into a list of positive repetition counts.
runs_list = [int(x) for x in args.runs_list.split(',') if x.strip()]
runs_list = [r for r in runs_list if r > 0]
if not runs_list:
    raise ValueError("--runs_list must contain at least one positive integer")

# Load test data
test_filename = "/path/to/test_list.csv"
df = pd.read_csv(test_filename)

# Determine the (inclusive) row slice, clamped to the rows that exist:
# a negative start would wrap around under iloc, and an end past the last
# row would only produce one IndexError per missing row.
_last_row = df.shape[0] - 1
start_idx = max(args.start, 0)
end_idx = _last_row if args.end is None else min(args.end, _last_row)

# =========================
# MAIN EVALUATION LOOP WITH REPETITIONS & PERSONAS
# =========================

from tqdm import tqdm  # ensure imported even if earlier

# Compiled once; captures the number that follows "Answer:" in a model reply.
_ANSWER_PATTERN = re.compile(r'Answer:\s*(\d+(?:\.\d+)?)')

# Directory prefix for screenshots; '_resized' is stripped from the dataset
# file name before lookup.
_IMAGE_ROOT = '/path/to/images/'

# Dataset row index -> encoded data URL. Example screenshots are sampled over
# and over, so each one is read and JPEG-encoded at most once per process.
_EXAMPLE_URL_CACHE = {}


def _read_screenshot(file_name):
    """Load a dataset screenshot as a BGR array, failing loudly if unreadable."""
    path = _IMAGE_ROOT + file_name.replace('_resized', '')
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {path}")
    return frame


def _example_data_url(row_idx):
    """Return the cached data URL for the screenshot of dataset row `row_idx`."""
    if row_idx not in _EXAMPLE_URL_CACHE:
        frame = _read_screenshot(df.iloc[row_idx]['image'])
        _EXAMPLE_URL_CACHE[row_idx] = frame_to_data_url(frame)
    return _EXAMPLE_URL_CACHE[row_idx]


for n_runs in tqdm(runs_list, desc="Run counts", position=0):
    print("\n" + "="*80)
    print(f"Running evaluation with {n_runs} repetitions per persona…")
    print("="*80)

    output_file = os.path.join(
        output_dir,
        f"forecast_results_runs{n_runs}_{args.start}_{args.end if args.end is not None else 'end'}.jsonl",
    )

    # Start from a clean file: one record per row is appended below, so a
    # leftover file from a previous run must not be extended.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Each nesting level gets its own tqdm position so the bars do not
    # overwrite one another.
    for i in tqdm(range(start_idx, end_idx + 1), desc=f"Datapoints for n_runs={n_runs}", position=1, leave=True):
        try:
            d = df.iloc[i]
            value = d.to_dict()

            # Prepare target image once
            image = _read_screenshot(d['image'])
            image_url = frame_to_data_url(image)

            # Candidate example rows: every row except the target. This does
            # not change between repetitions, so build it once per row.
            other_indices = list(range(df.shape[0]))
            other_indices.remove(i)
            sample_size = min(5, len(other_indices))

            persona_predictions = {}
            persona_mean_values = []

            for persona_key, persona_spec in persona_prompts.items():
                llm_type = PERSONA_TO_LLM.get(persona_key, "gpt")

                rep_predictions = []

                for _ in tqdm(range(n_runs), desc=f"{persona_key[:12]} reps", position=2, leave=False):
                    # Sample up to 5 random example rows and their scores
                    sample_indices = random.sample(other_indices, sample_size)
                    example_scores = [df.iloc[idx]['mean_score'] for idx in sample_indices]
                    examples_text = "\n".join(f"Score: {score:.1f}" for score in example_scores)

                    prompt_text = (
                        "Given the images below, the first 5 are example website screenshots "
                        "with their likeability scores (on a 0–10 scale, see the list below). "
                        "The last image is the one you should score. Based on your background "
                        "and preferences, carefully consider the last image and give a score "
                        "between 0 to 10 based on how much you like the website's visual design, "
                        "layout, colors, and content.\n\nHere are 5 example likeability scores (in order):\n" + examples_text
                    )

                    try:
                        if llm_type == "gpt":
                            # Only the GPT path consumes example images, so
                            # they are encoded here rather than for every
                            # repetition; the target image goes last.
                            example_images = [
                                (_example_data_url(idx), score)
                                for idx, score in zip(sample_indices, example_scores)
                            ]
                            example_images.append((image_url, None))

                            sys_prompt = create_persona_system_prompt(persona_spec)
                            resp = verbalize(prompt_text, sys_prompt, example_images)
                        else:
                            # Local Vision LLM path – prepend persona spec and feed target image
                            full_prompt = f"{persona_spec}\n\n{prompt_text}"
                            resp = verbalize_local(llm_type, full_prompt, image)

                        # Use the last "Answer:" occurrence in the reply.
                        number_matches = _ANSWER_PATTERN.findall(resp)
                        pred_value = float(number_matches[-1]) if number_matches else None
                    except Exception as e:
                        # A failed repetition is dropped; the others still count.
                        print(f"Error persona {persona_key} repetition: {e}")
                        pred_value = None

                    if pred_value is not None:
                        rep_predictions.append(pred_value)

                # compute persona mean
                persona_mean = float(np.mean(rep_predictions)) if rep_predictions else None
                persona_predictions[persona_key] = {
                    "predictions": rep_predictions,
                    "mean_prediction": persona_mean
                }
                if persona_mean is not None:
                    persona_mean_values.append(persona_mean)

            # Average across personas
            overall_mean_prediction = float(np.mean(persona_mean_values)) if persona_mean_values else None

            result_record = value.copy()
            result_record.update({
                "persona_predictions": persona_predictions,
                "overall_mean_prediction": overall_mean_prediction,
                "ground_truth": d['mean_score']
            })

            # Incremental save to JSONL file (append mode)
            try:
                with open(output_file, 'a') as f_out:
                    f_out.write(json.dumps(result_record) + '\n')
            except Exception as e:
                print(f"Warning: Could not write incremental JSONL: {e}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")
            continue

    print(f"Evaluation for n_runs={n_runs} completed. Results saved to {output_file}")